Crime Analysis

Author

Kristina Polonska, Mariana Konovalenko, Taras Baraniuk

Published

September 10, 2026

Import Libraries

show code
if (!require("pacman")) install.packages("pacman")

pacman::p_load(
  arrow,
  readr,
  tidyverse,
  lubridate,
  readxl,
  rvest,
  tidyr,
  plotly,
  scales,
  sf,
  janitor,
  stringr,
  patchwork,
  purrr
)

Read the Data

# Load the raw inputs from the local data/ folder (nothing is downloaded).
# Each dataset is read twice: the 20251019 files keep their original column
# names, while the 20251021 files have their names passed through
# clean_names().
complaint_data <- read_csv(
  file = "data/NYPD_Complaint_Data_Historic_20251019.csv"
)

complaint_data_full <- clean_names(
  read_parquet(file = "data/NYPD_Complaint_Data_Historic_20251021.parquet")
)

evictions <- read_csv_arrow(file = "data/Evictions_20251019.csv")

df_full_evictions <- clean_names(
  read_csv_arrow(file = "data/Evictions_20251021.csv")
)

Parse Crime in Ukraine Data from Wikipedia

# Scrape the Ukrainian-language Wikipedia article on crime in Ukraine and
# keep the first table with class "wikitable" as the raw crime table.
url <- "https://uk.wikipedia.org/wiki/Злочинність_в_Україні"

page <- read_html(url)
tabs <- html_elements(page, "table.wikitable")

# Only the first matching table is used.
crime_ukr <- html_table(tabs[[1]], fill = TRUE)

Preprocess the Data

# Prepare the complaint records for yearly summaries: expose BORO_NM as
# NYC_district, parse CMPLNT_FR_DT (month/day/year text) into a date, derive
# its year, and keep only rows whose year is known and not before 1900.
complaint_summary <- complaint_data |>
  rename(NYC_district = BORO_NM) |>
  mutate(date = mdy(CMPLNT_FR_DT)) |>
  mutate(year = year(date)) |>
  filter(year >= 1900) # rows with an unparseable date (NA year) drop out too

# Number of eviction records per execution year and upper-cased borough,
# ordered by year and then borough.
by_year_borough <- evictions |>
  count(
    year = year(mdy(`Executed Date`)),
    borough = toupper(BOROUGH),
    name = "n"
  ) |>
  arrange(year, borough) |>
  collect()

# Tidy the scraped table: keep rows from 2010 onwards, strip every non-digit
# character from the crime totals, and convert them to integers. Two rows for
# 2024 and 2025 are then appended by hand.
# NOTE(review): presumably these years are missing from the scraped table;
# the source of the two figures is not recorded here.
crime_ukr <- crime_ukr |>
  filter(!is.na(рік) & рік >= 2010) |>
  select(рік, total_crimes = `всього злочинів`) |>
  mutate(
    total_crimes = total_crimes |>
      str_replace_all("\\D", "") |>
      na_if("") |> # cells left empty after stripping stay NA
      as.integer()
  ) |>
  add_row(рік = c(2024, 2025), total_crimes = c(382335, 473662))

Visualizations

Crime in Ukraine

show code
# Line-and-point chart of the yearly crime totals in Ukraine, expressed in
# thousands of crimes.
ggplot(crime_ukr, aes(x = рік, y = total_crimes / 1000)) +
  # Line thickness is set with `linewidth`; using `size` for lines is
  # deprecated since ggplot2 3.4.0 and triggers a warning.
  geom_line(color = "#00CDCD", linewidth = 1.2) +
  geom_point(color = "#FFB90F", size = 2.5) +
  # One x-axis tick for every year present in the data.
  scale_x_continuous(breaks = crime_ukr$рік) +
  labs(
    title = "Total number of crimes committed each year in Ukraine",
    x = NULL,
    y = "Total crimes (in thousands)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    panel.grid.minor.x = element_blank()
  )
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

NYC Crime Complaints by Year

show code
# Yearly complaint counts restricted to 2000-2025.
crime_year <- complaint_summary |>
  filter(!is.na(year) & year >= 2000 & year <= 2025) |>
  group_by(year) |>
  summarise(complaints = n()) |>
  arrange(year)

# Static line chart. The `text` aesthetic is not drawn by ggplot2; it only
# carries the hover label that ggplotly() picks up below.
complaint_vis <- ggplot(
  crime_year,
  aes(
    x = year,
    y = complaints,
    group = 1,
    text = paste0("year: ", year, "\ncomplaints: ", comma(complaints))
  )
) +
  geom_line(color = "#2F539B", linewidth = 1) +
  scale_x_continuous(breaks = pretty_breaks(n = 5)) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "NYC crime complaints by year",
    x = "year",
    y = "number of complaints"
  ) +
  theme_minimal() +
  theme(
    text = element_text(face = "italic"),
    plot.background = element_rect(fill = "#F5F5F5"),
    panel.background = element_rect(fill = "#F5F5F5"),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(color = "#728FCE"),
    panel.grid.minor = element_blank()
  )

# Interactive version: show only the custom hover text, use a unified hover
# along x, and format y-axis ticks as comma-separated integers.
complaint_interactive <- layout(
  ggplotly(complaint_vis, tooltip = "text"),
  hovermode = "x unified",
  yaxis = list(tickformat = ",d")
)

complaint_interactive

Total Counts of Evictions and Felonies (2016-2025)

show code
# Citywide monthly totals of evictions and felony complaints for 2016-2025,
# drawn as two stacked panels with independent y-axes.

# Evictions: parse the execution date, keep the 2016-2025 window, drop rows
# with a missing borough or the "Citywide" label, then count rows per month.
eviction_monthly <- df_full_evictions |>
  select(borough, executed_date) |>
  mutate(executed_date = as_date(executed_date, format = "%m/%d/%Y")) |>
  dplyr::filter(
    executed_date >= as_date("2016-01-01"),
    executed_date <= as_date("2025-12-31"),
    !is.na(borough),
    borough != "Citywide"
  ) |>
  mutate(month_year = floor_date(executed_date, "month")) |>
  count(month_year, name = "total_count") |>
  collect() |>
  mutate(type = "Evictions")

# Felonies: same window, rows with a known borough and law_cat_cd "FELONY",
# counted per month of the complaint date.
crime_monthly <- complaint_data_full |>
  select(boro_nm, cmplnt_fr_dt, law_cat_cd) |>
  mutate(cmplnt_fr_dt = as_date(cmplnt_fr_dt, format = "%m/%d/%Y")) |>
  dplyr::filter(
    cmplnt_fr_dt >= as_date("2016-01-01"),
    cmplnt_fr_dt <= as_date("2025-12-31"),
    !is.na(boro_nm),
    law_cat_cd == "FELONY"
  ) |>
  mutate(month_year = floor_date(cmplnt_fr_dt, "month")) |>
  count(month_year, name = "total_count") |>
  collect() |>
  mutate(type = "Felonies")

# Stack both series and facet by type; the legend is hidden because the
# facet strips already name each series.
bind_rows(eviction_monthly, crime_monthly) |>
  ggplot(aes(x = month_year, y = total_count, color = type)) +
  geom_line(linewidth = 1) +
  facet_wrap(~type, ncol = 1, scales = "free_y") +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  scale_color_manual(
    values = c("Evictions" = "#1f78b4", "Felonies" = "#e31a1c")
  ) +
  labs(
    title = "Total Monthly Evictions vs. Felonies (2016-2025)",
    x = "Date",
    y = "Total Count"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

Total Counts of Evictions and Felonies in the Bronx (2016-2025)

show code
# Monthly totals of evictions and felony complaints for a single borough,
# 2016-2025, drawn as two stacked panels with independent y-axes.
target_borough <- "Bronx"

# Evictions in the target borough: parse the execution date, keep the
# 2016-2025 window, drop rows with a missing borough or the "Citywide" label,
# match the borough in title case, then count rows per month.
eviction_monthly <- df_full_evictions |>
  select(borough, executed_date) |>
  mutate(executed_date = as_date(executed_date, format = "%m/%d/%Y")) |>
  dplyr::filter(
    executed_date >= as_date("2016-01-01"),
    executed_date <= as_date("2025-12-31"),
    !is.na(borough),
    borough != "Citywide"
  ) |>
  mutate(borough = str_to_title(borough)) |>
  dplyr::filter(borough == target_borough) |>
  mutate(month_year = floor_date(executed_date, "month")) |>
  count(month_year, name = "total_count") |>
  collect() |>
  mutate(type = "Evictions")

# Felonies in the target borough: same window, rows with law_cat_cd "FELONY",
# borough name converted to title case before matching.
crime_monthly <- complaint_data_full |>
  select(boro_nm, cmplnt_fr_dt, law_cat_cd) |>
  mutate(cmplnt_fr_dt = as_date(cmplnt_fr_dt, format = "%m/%d/%Y")) |>
  dplyr::filter(
    cmplnt_fr_dt >= as_date("2016-01-01"),
    cmplnt_fr_dt <= as_date("2025-12-31"),
    !is.na(boro_nm),
    law_cat_cd == "FELONY"
  ) |>
  mutate(borough = str_to_title(boro_nm)) |>
  dplyr::filter(borough == target_borough) |>
  mutate(month_year = floor_date(cmplnt_fr_dt, "month")) |>
  count(month_year, name = "total_count") |>
  collect() |>
  mutate(type = "Felonies")

# Stack both series and facet by type. The title's year range matches the
# 2016-2025 filter window applied above.
bind_rows(eviction_monthly, crime_monthly) |>
  ggplot(aes(x = month_year, y = total_count, color = type)) +
  geom_line(linewidth = 1) +
  facet_wrap(~type, ncol = 1, scales = "free_y") +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  scale_color_manual(
    values = c("Evictions" = "#1f78b4", "Felonies" = "#e31a1c")
  ) +
  labs(
    title = paste(
      "Monthly Evictions vs. Felonies in", target_borough, "(2016-2025)"
    ),
    x = "Date",
    y = "Total Count"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

Eviction vs Felony Hotspots in 2019

show code
# Side-by-side 2D density maps of where 2019 evictions and 2019 felony
# complaints were located.

# Evictions dated 2019 with non-missing coordinates that fall inside the
# latitude/longitude box used below.
eviction_points <- df_full_evictions |>
  select(borough, latitude, longitude, executed_date) |>
  mutate(executed_date = as_date(executed_date, format = "%m/%d/%Y")) |>
  dplyr::filter(
    year(executed_date) == 2019,
    !is.na(latitude), !is.na(longitude)
  ) |>
  mutate(
    latitude = as.numeric(latitude),
    longitude = as.numeric(longitude)
  ) |>
  dplyr::filter(
    latitude > 40.4 & latitude < 40.9,
    longitude > -74.3 & longitude < -73.6
  ) |>
  collect()

# Felony complaints dated 2019, restricted to the same coordinate box.
crime_points <- complaint_data_full |>
  select(c(boro_nm, latitude, longitude, cmplnt_fr_dt, law_cat_cd)) |>
  mutate(
    cmplnt_fr_dt = as_date(cmplnt_fr_dt, format = "%m/%d/%Y"),
    latitude = as.numeric(latitude),
    longitude = as.numeric(longitude)
  ) |>
  dplyr::filter(
    year(cmplnt_fr_dt) == 2019,
    law_cat_cd == "FELONY",
    !is.na(latitude), !is.na(longitude)
  ) |>
  dplyr::filter(
    latitude > 40.4 & latitude < 40.9,
    longitude > -74.3 & longitude < -73.6
  ) |>
  collect()

# Density contours filled by level. `after_stat(level)` replaces the
# `..level..` notation, which is deprecated since ggplot2 3.4.0.
plot_evictions <- ggplot() +
  stat_density_2d(
    data = eviction_points,
    aes(x = longitude, y = latitude, fill = after_stat(level)),
    geom = "polygon"
  ) +
  scale_fill_viridis_c(option = "magma") +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "Eviction Hotspots in 2019")

plot_felonies <- ggplot() +
  stat_density_2d(
    data = crime_points,
    aes(x = longitude, y = latitude, fill = after_stat(level)),
    geom = "polygon"
  ) +
  scale_fill_viridis_c(option = "magma") +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "Felony Hotspots in 2019")

# Combining two ggplot objects with `+` relies on patchwork, which places
# them side by side.
plot_evictions + plot_felonies

Relation of Crime Complaints and Evictions

show code
# Complaints per year and upper-cased borough for 2000-2025. Every
# year/borough combination of the five listed boroughs is present, with 0
# where no complaints were recorded.
crime_by_year_borough <- complaint_data_full |>
  mutate(
    year = year(mdy(cmplnt_fr_dt)),
    borough = toupper(boro_nm)
  ) |>
  filter(!is.na(year) & year >= 2000 & year <= 2025) |>
  count(year, borough, name = "complaints") |>
  mutate(year = as.integer(year)) |>
  complete(
    year = 2000:2025,
    borough = c("MANHATTAN", "BROOKLYN", "QUEENS", "BRONX", "STATEN ISLAND"),
    fill = list(complaints = 0)
  )

# Eviction records per execution year and upper-cased borough.
ev_by_year_boro <- df_full_evictions |>
  mutate(
    year = year(mdy(executed_date)),
    borough = toupper(borough)
  ) |>
  group_by(year, borough) |>
  summarise(evict = n(), .groups = "drop")

# Bivariate class per year/borough: within each year, boroughs are split into
# three ranked bins by complaints (c_q) and by evictions (e_q); bi_key joins
# the two bin numbers as "c-e" for use as a palette key.
# NOTE(review): ntile() splits tied values by row order, so in a year where
# every borough has evict == 0 the e_q bins are arbitrary — confirm this is
# acceptable.
bi <- left_join(
  crime_by_year_borough,
  ev_by_year_boro,
  by = c("year", "borough")
) |>
  mutate(evict = replace_na(evict, 0)) |> # boroughs/years without evictions
  group_by(year) |>
  mutate(
    c_q = ntile(complaints, 3),
    e_q = ntile(evict, 3),
    bi_key = paste0(c_q, "-", e_q)
  ) |>
  ungroup() |>
  collect()

# Read the borough boundary shapefile as an sf object.
nyc_boroughs <- st_read(dsn = "data/nybb.shp")
Reading layer `nybb' from data source `/home/taras/Documents/Workspace/DS/crime-analysis/data/nybb.shp' using driver `ESRI Shapefile'
Simple feature collection with 5 features and 4 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 913175.1 ymin: 120128.4 xmax: 1067383 ymax: 272844.3
Projected CRS: NAD83 / New York Long Island (ftUS)
show code
# Animated bivariate map: one frame per year, each borough filled by its
# combined complaints/evictions class.

# Keep only an upper-cased borough name (the join key) and the geometry.
nyc_boroughs <- nyc_boroughs |>
  mutate(borough = toupper(BoroName)) |>
  select(borough, geometry)

# One row per borough and year; `id` is a per-row identifier passed to plotly
# through the `ids` aesthetic.
map_bi <- nyc_boroughs |>
  left_join(bi, by = "borough") |>
  mutate(id = row_number())

# Palette keyed by "<complaints bin>-<evictions bin>".
bi_pal <- c(
  "1-1" = "#e8e8e8", "2-1" = "#b8d6be", "3-1" = "#73ae80",
  "1-2" = "#bed8ec", "2-2" = "#90b2b3", "3-2" = "#5a9178",
  "1-3" = "#64acbe", "2-3" = "#4a7c8c", "3-3" = "#3b5d70"
)

# A single geometry per borough, used for the outline layer.
nyc_boroughs_static <- map_bi |>
  group_by(borough) |>
  slice(1) |>
  ungroup() |>
  select(borough, geometry)

# `frame`, `ids` and `text` are not ggplot2 aesthetics; they are consumed by
# ggplotly() for the animation frames, object identity and hover label. The
# `text` mapping supplies the content that `tooltip = "text"` asks for below.
p_bi_static <- ggplot(map_bi) +
  geom_sf(
    data = nyc_boroughs_static,
    fill = NA, color = "black", linewidth = 0.5, inherit.aes = FALSE
  ) +
  geom_sf(
    aes(
      fill = bi_key,
      frame = year,
      ids = id,
      text = paste0(
        borough, " (", year, ")",
        "\ncomplaints: ", comma(complaints),
        "\nevictions: ", comma(evict)
      )
    ),
    color = NA, linewidth = 0.3
  ) +
  scale_fill_manual(
    values = bi_pal, na.value = "grey80", name = "complaints × evictions"
  ) +
  labs(title = "Relation of crime complaints and evictions") +
  theme_minimal() +
  theme(
    plot.title = element_text(
      face = "bold", size = 16, hjust = 0.5, margin = margin(t = 5, b = 5)
    ),
    legend.position = "right",
    plot.background = element_rect(fill = "white", color = NA)
  )

p_bi_interactive <- ggplotly(p_bi_static, tooltip = "text")

p_bi_interactive